import os
import pandas as pd
import numpy as np

# 1. 路径配置
MEDIA_FOLDER   = "Media_Monthly_Counts_Quad&Event (Added)"
BOTH_DATA_FILE = "Monthly_Counts_Quad&Event.xlsx"
OUTPUT_FILE    = "Hellinger_Quad&Event.xlsx"

# 2. 构造完整月份索引（2014-01 至 2023-12）
FULL_MONTHS = pd.period_range("2014-01", "2023-12", freq="M")

# 3. 通用加载函数
def load_counts(path, sheet, infer_format=True):
    df = pd.read_excel(path, sheet_name=sheet, engine="openpyxl")
    s = df['Month'].astype(str).str.strip()
    if infer_format:
        dt = pd.to_datetime(s, infer_datetime_format=True, errors='coerce')
    else:
        dt = pd.to_datetime(s, format="%Y-%m", errors='coerce')
    df['Period'] = dt.dt.to_period('M')
    df = df.drop(columns=['Month']).set_index('Period')
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
    df = df.groupby(df.index).sum()
    df = df.reindex(FULL_MONTHS, fill_value=0)
    return df

# 4. Hellinger 距离函数
def hellinger(p, q):
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

# 5. 读取全量分布
glob_q = load_counts(BOTH_DATA_FILE, "QuadClass", infer_format=False)
glob_e = load_counts(BOTH_DATA_FILE, "EventRootCode", infer_format=False)

quad_cols = glob_q.columns
evt_cols  = glob_e.columns

epsilon = 1e-8

# 6. 主循环
results = []

for fname in os.listdir(MEDIA_FOLDER):
    if not fname.lower().endswith(".xlsx"):
        continue

    media = os.path.splitext(fname)[0]
    path  = os.path.join(MEDIA_FOLDER, fname)

    mq = load_counts(path, "QuadClass")
    me = load_counts(path, "EventRootCode")

    mq = mq.reindex(columns=quad_cols, fill_value=0)
    me = me.reindex(columns=evt_cols,  fill_value=0)

    # 概率化
    pq = (mq + epsilon).div((mq + epsilon).sum(axis=1), axis=0)
    pe = (me + epsilon).div((me + epsilon).sum(axis=1), axis=0)
    gq = (glob_q + epsilon).div((glob_q + epsilon).sum(axis=1), axis=0)
    ge = (glob_e + epsilon).div((glob_e + epsilon).sum(axis=1), axis=0)

    common_q = pq.index.intersection(gq.index)
    common_e = pe.index.intersection(ge.index)

    hq = [hellinger(pq.loc[m].values, gq.loc[m].values) for m in common_q]
    he = [hellinger(pe.loc[m].values, ge.loc[m].values) for m in common_e]

    mean_q = float(np.nanmean(hq)) if hq else np.nan
    mean_e = float(np.nanmean(he)) if he else np.nan

    results.append({
        "Media":       media,
        "Hellinger_Quad":  mean_q,
        "Hellinger_Event": mean_e,
        "Hellinger_Mean":  np.nanmean([mean_q, mean_e])
    })

# 7. 输出
pd.DataFrame(results).set_index("Media").to_excel(OUTPUT_FILE)
print(f"✅ 完成！Hellinger 距离结果已保存至 {OUTPUT_FILE}")
